Use raw fastq and generate the quality plots to asses the quality of reads
Filter and trim out bad sequences and bases from our sequencing files
Write out fastq files with high quality sequences
Evaluate the quality from our filter and trim.
Infer errors on forward and reverse reads individually
Identified ASVs on forward and reverse reads separately using the error model.
Merge forward and reverse ASVs into “contigous ASVs”.
Generate ASV count table. (otu_table input for
phyloseq.).
ASV count table: otu_table
Taxonomy table tax_table
Sample information: sample_table track the reads
lost throughout DADA2 workflow.
#Set the raw fastq path to the raw sequencing files
#Path to the fastq files
raw_fastqs_path <- "data/01_DADA2/01_raw_gzipped_fastqs/"
raw_fastqs_path## [1] "data/01_DADA2/01_raw_gzipped_fastqs/"
## [1] "SRR17060816_1.fastq.gz" "SRR17060816_2.fastq.gz" "SRR17060817_1.fastq.gz"
## [4] "SRR17060817_2.fastq.gz" "SRR17060818_1.fastq.gz" "SRR17060818_2.fastq.gz"
## [7] "SRR17060819_1.fastq.gz" "SRR17060819_2.fastq.gz" "SRR17060820_1.fastq.gz"
## [10] "SRR17060820_2.fastq.gz" "SRR17060821_1.fastq.gz" "SRR17060821_2.fastq.gz"
## [13] "SRR17060822_1.fastq.gz" "SRR17060822_2.fastq.gz" "SRR17060823_1.fastq.gz"
## [16] "SRR17060823_2.fastq.gz" "SRR17060824_1.fastq.gz" "SRR17060824_2.fastq.gz"
## [19] "SRR17060825_1.fastq.gz" "SRR17060825_2.fastq.gz" "SRR17060826_1.fastq.gz"
## [22] "SRR17060826_2.fastq.gz" "SRR17060827_1.fastq.gz" "SRR17060827_2.fastq.gz"
## [25] "SRR17060828_1.fastq.gz" "SRR17060828_2.fastq.gz" "SRR17060829_1.fastq.gz"
## [28] "SRR17060829_2.fastq.gz" "SRR17060830_1.fastq.gz" "SRR17060830_2.fastq.gz"
## [31] "SRR17060831_1.fastq.gz" "SRR17060831_2.fastq.gz" "SRR17060832_1.fastq.gz"
## [34] "SRR17060832_2.fastq.gz" "SRR17060833_1.fastq.gz" "SRR17060833_2.fastq.gz"
## [37] "SRR17060834_1.fastq.gz" "SRR17060834_2.fastq.gz" "SRR17060835_1.fastq.gz"
## [40] "SRR17060835_2.fastq.gz" "SRR17060836_1.fastq.gz" "SRR17060836_2.fastq.gz"
## [43] "SRR17060837_1.fastq.gz" "SRR17060837_2.fastq.gz" "SRR17060838_1.fastq.gz"
## [46] "SRR17060838_2.fastq.gz" "SRR17060839_1.fastq.gz" "SRR17060839_2.fastq.gz"
## [49] "SRR17060840_1.fastq.gz" "SRR17060840_2.fastq.gz" "SRR17060841_1.fastq.gz"
## [52] "SRR17060841_2.fastq.gz" "SRR17060842_1.fastq.gz" "SRR17060842_2.fastq.gz"
## [55] "SRR17060843_1.fastq.gz" "SRR17060843_2.fastq.gz" "SRR17060844_1.fastq.gz"
## [58] "SRR17060844_2.fastq.gz" "SRR17060845_1.fastq.gz" "SRR17060845_2.fastq.gz"
## [61] "SRR17060846_1.fastq.gz" "SRR17060846_2.fastq.gz" "SRR17060847_1.fastq.gz"
## [64] "SRR17060847_2.fastq.gz"
## chr [1:64] "SRR17060816_1.fastq.gz" "SRR17060816_2.fastq.gz" ...
#Create a vector of forward reads
forward_reads <- list.files(raw_fastqs_path, pattern = "_1.fastq.gz", full.names = TRUE)
#Intuition check
head(forward_reads)## [1] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060816_1.fastq.gz"
## [2] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060817_1.fastq.gz"
## [3] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060818_1.fastq.gz"
## [4] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060819_1.fastq.gz"
## [5] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060820_1.fastq.gz"
## [6] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060821_1.fastq.gz"
#Create a vector of reverse reads
reverse_reads <-list.files(raw_fastqs_path, pattern = "_2.fastq.gz", full.names = TRUE)
#Intuition check
head(reverse_reads)## [1] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060816_2.fastq.gz"
## [2] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060817_2.fastq.gz"
## [3] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060818_2.fastq.gz"
## [4] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060819_2.fastq.gz"
## [5] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060820_2.fastq.gz"
## [6] "data/01_DADA2/01_raw_gzipped_fastqs//SRR17060821_2.fastq.gz"
# Randomly select 12 samples from dataset to evaluate
# Selecting 12 is typically better than 2 (like we did in class for efficiency)
random_samples <- sample(1:length(reverse_reads), size = 12)
random_samples## [1] 16 22 15 1 14 6 30 27 11 13 23 32
# Calculate and plot quality of these two samples
forward_filteredQual_plot_12 <- plotQualityProfile(forward_reads[random_samples]) +
labs(title = "Forward Read: Raw Quality")
reverse_filteredQual_plot_12 <- plotQualityProfile(reverse_reads[random_samples]) +
labs(title = "Reverse Read: Raw Quality")
# Plot them together with patchwork
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12# Aggregate all QC plots
# Forward reads
forward_preQC_plot <-
plotQualityProfile(forward_reads, aggregate = TRUE) +
labs(title = "Forward Pre-QC")
# reverse reads
reverse_preQC_plot <-
plotQualityProfile(reverse_reads, aggregate = TRUE) +
labs(title = "Reverse Pre-QC")
preQC_aggregate_plot <-
# Plot the forward and reverse together
forward_preQC_plot + reverse_preQC_plot
# Show the plot
preQC_aggregate_plot# vector of our samples, extract the sample information from our file
samples <- sapply(strsplit(basename(forward_reads), "_"), `[`,1)
#Intuition check
head(samples)## [1] "SRR17060816" "SRR17060817" "SRR17060818" "SRR17060819" "SRR17060820"
## [6] "SRR17060821"
#place filtered reads into filtered_fastqs_path
filtered_fastqs_path <- "data/01_DADA2/08_filtered_fullLength/"
filtered_fastqs_path## [1] "data/01_DADA2/08_filtered_fullLength/"
# create 2 variables : filtered_F, filtered_R
filtered_forward_reads <-
file.path(filtered_fastqs_path, paste0(samples, "_R1_filtered.fastq.gz"))
#Intuition check
head(filtered_forward_reads)## [1] "data/01_DADA2/08_filtered_fullLength//SRR17060816_R1_filtered.fastq.gz"
## [2] "data/01_DADA2/08_filtered_fullLength//SRR17060817_R1_filtered.fastq.gz"
## [3] "data/01_DADA2/08_filtered_fullLength//SRR17060818_R1_filtered.fastq.gz"
## [4] "data/01_DADA2/08_filtered_fullLength//SRR17060819_R1_filtered.fastq.gz"
## [5] "data/01_DADA2/08_filtered_fullLength//SRR17060820_R1_filtered.fastq.gz"
## [6] "data/01_DADA2/08_filtered_fullLength//SRR17060821_R1_filtered.fastq.gz"
## [1] 32
filtered_reverse_reads <-
file.path(filtered_fastqs_path, paste0(samples, "_R2_filtered.fastq.gz"))
#Intuition check
length(filtered_reverse_reads)## [1] 32
Parameters of filter and trim DEPEND ON THE DATASET
maxN = number of N bases. Remove all Ns from the
data.maxEE = quality filtering threshold applied to expected
errors. By default, all expected errors. Mar recommends using c(1,1).
Here, if there is maxEE expected errors, its okay. If more, throw away
sequence.trimLeft = trim certain number of base pairs on start
of each readtruncQ = truncate reads at the first instance of a
quality score less than or equal to selected number. Chose 2rm.phix = remove phi xcompress = make filtered files .gzippedmultithread = multithread#Assign a vector to filtered reads
#Trim out poor bases
#Write out filtered fastq files
filtered_reads <-
filterAndTrim(fwd = forward_reads, filt = filtered_forward_reads,
rev = reverse_reads, filt.rev = filtered_reverse_reads,
truncLen = c(245,245), trimLeft = c(17,21),
maxN = 0, maxEE = c(1, 1),truncQ = 2, rm.phix = TRUE,
compress = TRUE, multithread = 6)
# Primers are V3-V4
# 341F (5′-CCT ACG GGN GGC WGC AG-3′) (17 bp) from Herlemann et al. 2011
# 806R (5′-GGA CTA CHV GGG TWT CTA AT-3′) (20 bp) from Caporaso et al. 2011
# 785R (5′-GAC TAC HVG GGT ATC TAA TCC-3′)(21 bp) from Herlemann et al. 2011# Plot the 12 random samples after QC
forward_filteredQual_plot_12 <-
plotQualityProfile(filtered_forward_reads[random_samples]) +
labs(title = "Trimmed Forward Read Quality")
reverse_filteredQual_plot_12 <-
plotQualityProfile(filtered_reverse_reads[random_samples]) +
labs(title = "Trimmed Reverse Read Quality")
# Put the two plots together
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12# Aggregate all QC plots
# Forward reads
forward_postQC_plot <-
plotQualityProfile(filtered_forward_reads, aggregate = TRUE) +
labs(title = "Forward Post-QC")
# reverse reads
reverse_postQC_plot <-
plotQualityProfile(filtered_reverse_reads, aggregate = TRUE) +
labs(title = "Reverse Post-QC")
postQC_aggregate_plot <-
# Plot the forward and reverse together
forward_postQC_plot + reverse_postQC_plot
# Show the plot
postQC_aggregate_plotfilterAndTrim## reads.in reads.out
## SRR17060816_1.fastq.gz 285558 78567
## SRR17060817_1.fastq.gz 676817 139571
## SRR17060818_1.fastq.gz 591364 144048
## SRR17060819_1.fastq.gz 379452 101847
## SRR17060820_1.fastq.gz 570270 145512
## SRR17060821_1.fastq.gz 556682 149757
# calculate some stats
filtered_df %>%
reframe(median_reads_in = median(reads.in),
median_reads_out = median(reads.out),
median_percent_retained = (median(reads.out)/median(reads.in)))## median_reads_in median_reads_out median_percent_retained
## 1 294748.5 71079.5 0.2411531
About 24 percent of reads are retained when maxEE = 1. About 50 percent of reads are retained when maxEE = 2.
filterAndTrim()
more? If so, which parameters?Note every sequencing run needs to be run
separately! The error model MUST be run separately on
each illumina dataset. If you’d like to combine the datasets from
multiple sequencing runs, you’ll need to do the exact same
filterAndTrim() step AND, very importantly, you’ll
need to have the same primer and ASV length expected by the output.
Infer error rates for all possible transitions within purines and pyrimidines (A<>G or C<>T) and transversions between all purine and pyrimidine combinations.
Error model is learned by alternating estimation of the error rates and inference of sample composition until they converge.
## 105799524 total bases in 464033 reads from 4 samples will be used for learning the error rates.
#Plot forward reads errors
forward_error_plot <-
plotErrors(error_forward_reads, nominalQ = TRUE) +
labs(title = "Forward Read Error Model")
#Reverse reads
error_reverse_reads <-
learnErrors(filtered_reverse_reads, multithread = 6)## 103943392 total bases in 464033 reads from 4 samples will be used for learning the error rates.
#Plot reverse reads errors
reverse_error_plot <-
plotErrors(error_reverse_reads, nominalQ = TRUE) +
labs(title = "Reverse Read Error Model")
#Put the two plots together
forward_error_plot + reverse_error_plot## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
The points do a pretty good job of matching the black lines.
Details of the plot: - Points: The observed error
rates for each consensus quality score.
- Black line: Estimated error rates after convergence
of the machine-learning algorithm.
- Red line: The error rates expected under the nominal
definition of the Q-score.
Similar to what is mentioned in the dada2 tutorial: the estimated error rates (black line) are a “reasonably good” fit to the observed rates (points), and the error rates drop with increased quality as expected. We can now infer ASVs!
An important note: This process occurs separately on forward and reverse reads! This is quite a different approach from how OTUs are identified in Mothur and also from UCHIME, oligotyping, and other OTU, MED, and ASV approaches.
#Infer forward ASVs
dada_forward <- dada(filtered_forward_reads,
err = error_forward_reads,
multithread = 6)## Sample 1 - 78567 reads in 17900 unique sequences.
## Sample 2 - 139571 reads in 7180 unique sequences.
## Sample 3 - 144048 reads in 11602 unique sequences.
## Sample 4 - 101847 reads in 16976 unique sequences.
## Sample 5 - 145512 reads in 12861 unique sequences.
## Sample 6 - 149757 reads in 11057 unique sequences.
## Sample 7 - 13757 reads in 4558 unique sequences.
## Sample 8 - 15877 reads in 6912 unique sequences.
## Sample 9 - 99835 reads in 11681 unique sequences.
## Sample 10 - 10711 reads in 4326 unique sequences.
## Sample 11 - 7737 reads in 3241 unique sequences.
## Sample 12 - 6101 reads in 2262 unique sequences.
## Sample 13 - 11420 reads in 4103 unique sequences.
## Sample 14 - 337 reads in 264 unique sequences.
## Sample 15 - 8561 reads in 2169 unique sequences.
## Sample 16 - 136021 reads in 16003 unique sequences.
## Sample 17 - 73763 reads in 7893 unique sequences.
## Sample 18 - 139426 reads in 16818 unique sequences.
## Sample 19 - 139621 reads in 25857 unique sequences.
## Sample 20 - 142670 reads in 21399 unique sequences.
## Sample 21 - 142146 reads in 9535 unique sequences.
## Sample 22 - 80696 reads in 16324 unique sequences.
## Sample 23 - 142257 reads in 13663 unique sequences.
## Sample 24 - 369 reads in 188 unique sequences.
## Sample 25 - 14449 reads in 4132 unique sequences.
## Sample 26 - 11750 reads in 2914 unique sequences.
## Sample 27 - 11373 reads in 3197 unique sequences.
## Sample 28 - 10639 reads in 3937 unique sequences.
## Sample 29 - 8736 reads in 4094 unique sequences.
## Sample 30 - 477 reads in 236 unique sequences.
## Sample 31 - 113934 reads in 16629 unique sequences.
## Sample 32 - 68396 reads in 9086 unique sequences.
#Infer reverse ASVs
dada_reverse <- dada(filtered_reverse_reads,
err = error_reverse_reads,
multithread = 6)## Sample 1 - 78567 reads in 20028 unique sequences.
## Sample 2 - 139571 reads in 19054 unique sequences.
## Sample 3 - 144048 reads in 20465 unique sequences.
## Sample 4 - 101847 reads in 29160 unique sequences.
## Sample 5 - 145512 reads in 22937 unique sequences.
## Sample 6 - 149757 reads in 23158 unique sequences.
## Sample 7 - 13757 reads in 4372 unique sequences.
## Sample 8 - 15877 reads in 7106 unique sequences.
## Sample 9 - 99835 reads in 20534 unique sequences.
## Sample 10 - 10711 reads in 4139 unique sequences.
## Sample 11 - 7737 reads in 3149 unique sequences.
## Sample 12 - 6101 reads in 2088 unique sequences.
## Sample 13 - 11420 reads in 4227 unique sequences.
## Sample 14 - 337 reads in 255 unique sequences.
## Sample 15 - 8561 reads in 3135 unique sequences.
## Sample 16 - 136021 reads in 32056 unique sequences.
## Sample 17 - 73763 reads in 15574 unique sequences.
## Sample 18 - 139426 reads in 26862 unique sequences.
## Sample 19 - 139621 reads in 39231 unique sequences.
## Sample 20 - 142670 reads in 37812 unique sequences.
## Sample 21 - 142146 reads in 20879 unique sequences.
## Sample 22 - 80696 reads in 28248 unique sequences.
## Sample 23 - 142257 reads in 24296 unique sequences.
## Sample 24 - 369 reads in 200 unique sequences.
## Sample 25 - 14449 reads in 4753 unique sequences.
## Sample 26 - 11750 reads in 3274 unique sequences.
## Sample 27 - 11373 reads in 2706 unique sequences.
## Sample 28 - 10639 reads in 3392 unique sequences.
## Sample 29 - 8736 reads in 4307 unique sequences.
## Sample 30 - 477 reads in 255 unique sequences.
## Sample 31 - 113934 reads in 28138 unique sequences.
## Sample 32 - 68396 reads in 18222 unique sequences.
## $SRR17060816_R1_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 414 sequence variants were inferred from 17900 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## $SRR17060816_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 346 sequence variants were inferred from 20028 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## $SRR17060827_R1_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 46 sequence variants were inferred from 2262 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## $SRR17060827_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 28 sequence variants were inferred from 2088 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
Now, merge the forward and reverse ASVs into contigs.
# merge forward and reverse ASVs
merged_ASVs <- mergePairs(dada_forward, filtered_forward_reads,
dada_reverse, filtered_reverse_reads,
verbose = TRUE)## 75231 paired-reads (in 1141 unique pairings) successfully merged out of 77260 (in 2336 pairings) input.
## 138140 paired-reads (in 573 unique pairings) successfully merged out of 138942 (in 941 pairings) input.
## 142515 paired-reads (in 521 unique pairings) successfully merged out of 143329 (in 888 pairings) input.
## 100141 paired-reads (in 733 unique pairings) successfully merged out of 100889 (in 1074 pairings) input.
## 144552 paired-reads (in 446 unique pairings) successfully merged out of 145020 (in 675 pairings) input.
## 148687 paired-reads (in 416 unique pairings) successfully merged out of 149200 (in 562 pairings) input.
## 12620 paired-reads (in 118 unique pairings) successfully merged out of 13440 (in 332 pairings) input.
## 13435 paired-reads (in 287 unique pairings) successfully merged out of 15024 (in 785 pairings) input.
## 98468 paired-reads (in 611 unique pairings) successfully merged out of 99336 (in 1026 pairings) input.
## 9091 paired-reads (in 130 unique pairings) successfully merged out of 10090 (in 516 pairings) input.
## 6634 paired-reads (in 119 unique pairings) successfully merged out of 7138 (in 315 pairings) input.
## 5300 paired-reads (in 50 unique pairings) successfully merged out of 5871 (in 172 pairings) input.
## 10363 paired-reads (in 123 unique pairings) successfully merged out of 11167 (in 315 pairings) input.
## 154 paired-reads (in 12 unique pairings) successfully merged out of 191 (in 14 pairings) input.
## 8219 paired-reads (in 161 unique pairings) successfully merged out of 8386 (in 182 pairings) input.
## 135381 paired-reads (in 428 unique pairings) successfully merged out of 135828 (in 517 pairings) input.
## 72848 paired-reads (in 427 unique pairings) successfully merged out of 73333 (in 517 pairings) input.
## 138154 paired-reads (in 512 unique pairings) successfully merged out of 138789 (in 768 pairings) input.
## 134373 paired-reads (in 2043 unique pairings) successfully merged out of 136915 (in 2697 pairings) input.
## 140773 paired-reads (in 924 unique pairings) successfully merged out of 141840 (in 1300 pairings) input.
## 141576 paired-reads (in 219 unique pairings) successfully merged out of 141821 (in 316 pairings) input.
## 76425 paired-reads (in 1687 unique pairings) successfully merged out of 78407 (in 2103 pairings) input.
## 141283 paired-reads (in 507 unique pairings) successfully merged out of 141834 (in 723 pairings) input.
## 308 paired-reads (in 11 unique pairings) successfully merged out of 316 (in 13 pairings) input.
## 12964 paired-reads (in 118 unique pairings) successfully merged out of 13765 (in 392 pairings) input.
## 11345 paired-reads (in 64 unique pairings) successfully merged out of 11604 (in 101 pairings) input.
## 10864 paired-reads (in 73 unique pairings) successfully merged out of 11074 (in 150 pairings) input.
## 9381 paired-reads (in 99 unique pairings) successfully merged out of 10254 (in 322 pairings) input.
## 6743 paired-reads (in 221 unique pairings) successfully merged out of 7816 (in 770 pairings) input.
## 416 paired-reads (in 12 unique pairings) successfully merged out of 429 (in 15 pairings) input.
## 111112 paired-reads (in 675 unique pairings) successfully merged out of 113226 (in 1350 pairings) input.
## 67247 paired-reads (in 360 unique pairings) successfully merged out of 67856 (in 610 pairings) input.
## [1] "list"
## [1] 32
## [1] "SRR17060816_R1_filtered.fastq.gz" "SRR17060817_R1_filtered.fastq.gz"
## [3] "SRR17060818_R1_filtered.fastq.gz" "SRR17060819_R1_filtered.fastq.gz"
## [5] "SRR17060820_R1_filtered.fastq.gz" "SRR17060821_R1_filtered.fastq.gz"
## [7] "SRR17060822_R1_filtered.fastq.gz" "SRR17060823_R1_filtered.fastq.gz"
## [9] "SRR17060824_R1_filtered.fastq.gz" "SRR17060825_R1_filtered.fastq.gz"
## [11] "SRR17060826_R1_filtered.fastq.gz" "SRR17060827_R1_filtered.fastq.gz"
## [13] "SRR17060828_R1_filtered.fastq.gz" "SRR17060829_R1_filtered.fastq.gz"
## [15] "SRR17060830_R1_filtered.fastq.gz" "SRR17060831_R1_filtered.fastq.gz"
## [17] "SRR17060832_R1_filtered.fastq.gz" "SRR17060833_R1_filtered.fastq.gz"
## [19] "SRR17060834_R1_filtered.fastq.gz" "SRR17060835_R1_filtered.fastq.gz"
## [21] "SRR17060836_R1_filtered.fastq.gz" "SRR17060837_R1_filtered.fastq.gz"
## [23] "SRR17060838_R1_filtered.fastq.gz" "SRR17060839_R1_filtered.fastq.gz"
## [25] "SRR17060840_R1_filtered.fastq.gz" "SRR17060841_R1_filtered.fastq.gz"
## [27] "SRR17060842_R1_filtered.fastq.gz" "SRR17060843_R1_filtered.fastq.gz"
## [29] "SRR17060844_R1_filtered.fastq.gz" "SRR17060845_R1_filtered.fastq.gz"
## [31] "SRR17060846_R1_filtered.fastq.gz" "SRR17060847_R1_filtered.fastq.gz"
# Create the ASV Count Table
raw_ASV_table <- makeSequenceTable(merged_ASVs)
# Write out the file to data/01_DADA2
# Check the type and dimensions of the data
dim(raw_ASV_table)## [1] 32 8827
## [1] "matrix" "array"
## [1] "integer"
# Inspect the distribution of sequence lengths of all ASVs in dataset
table(nchar(getSequences(raw_ASV_table)))##
## 228 229 233 234 236 247 253 256 257 258 259 260 261 273 290 300
## 8 5 2 1 2 2 1 1 4 4 12 45 1 1 1 2
## 326 340 350 355 361 365 366 367 368 369 370 372 373 374 376 382
## 1 1 4 1 1 34 114 33 37 150 20 2 3 5 3 3
## 383 384 385 386 387 389 390 391 392 394 396 397 398 400 401 402
## 4 4 7 303 2 2 57 161 15 2 1 1 1 2 80 671
## 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418
## 339 223 641 92 21 11 10 21 7 9 2 12 5 12 9 13
## 419 420 421 422 423 424 425 426 427 428 429 430 431 432 440
## 9 11 90 2007 76 64 23 664 1870 717 40 9 1 1 1
# Inspect the distribution of sequence lengths of all ASVs in dataset
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Raw distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###################################################
###################################################
# TRIM THE ASVS
# Let's trim the ASVs to only be the right size, which is 426 - 428.
# We will allow for a few
raw_ASV_table_trimmed <- raw_ASV_table[,nchar(colnames(raw_ASV_table)) %in% 421:428]
# Inspect the distribution of sequence lengths of all ASVs in dataset
table(nchar(getSequences(raw_ASV_table_trimmed)))##
## 421 422 423 424 425 426 427 428
## 90 2007 76 64 23 664 1870 717
## [1] 0.7764054
# Inspect the distribution of sequence lengths of all ASVs in dataset
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Trimmed distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Note the peak at 249 is ABOVE 3000
# Let's zoom in on the plot
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Trimmed distribution of ASV length") +
scale_y_continuous(limits = c(0, 500))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Taking into account the lower, zoomed-in plot. Do we want to remove those extra ASVs?
Sometimes chimeras arise in our workflow.
Chimeric sequences are artificial sequences formed by the combination of two or more distinct biological sequences. These chimeric sequences can arise during the polymerase chain reaction (PCR) amplification step of the 16S rRNA gene, where fragments from different templates can be erroneously joined together.
Chimera removal is an essential step in the analysis of 16S sequencing data to improve the accuracy of downstream analyses, such as taxonomic assignment and diversity assessment. It helps to avoid the inclusion of misleading or spurious sequences that could lead to incorrect biological interpretations.
# Remove the chimeras in the raw ASV table
noChimeras_ASV_table <- removeBimeraDenovo(raw_ASV_table_trimmed,
method="consensus",
multithread=TRUE, verbose=TRUE)## Identified 2909 bimeras out of 5511 input sequences.
## [1] 32 2602
## [1] 0.9511942
## [1] 0.7385124
# Plot it
data.frame(Seq_Length_NoChim = nchar(getSequences(noChimeras_ASV_table))) %>%
ggplot(aes(x = Seq_Length_NoChim )) +
geom_histogram()+
labs(title = "Trimmed + Chimera Removal distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Here, we will look at the number of reads that were lost in the filtering, denoising, merging, and chimera removal.
# A little function to identify number seqs
getN <- function(x) sum(getUniques(x))
# Make the table to track the seqs
track <- cbind(filtered_reads,
sapply(dada_forward, getN),
sapply(dada_reverse, getN),
sapply(merged_ASVs, getN),
rowSums(noChimeras_ASV_table))
head(track)## reads.in reads.out
## SRR17060816_1.fastq.gz 285558 78567 77786 77971 75231 56999
## SRR17060817_1.fastq.gz 676817 139571 139211 139234 138140 130614
## SRR17060818_1.fastq.gz 591364 144048 143672 143627 142515 131433
## SRR17060819_1.fastq.gz 379452 101847 101323 101334 100141 59356
## SRR17060820_1.fastq.gz 570270 145512 145202 145267 144552 121708
## SRR17060821_1.fastq.gz 556682 149757 149461 149440 148687 135101
# Update column names to be more informative (most are missing at the moment!)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")
rownames(track) <- samples
# Generate a dataframe to track the reads through our DADA2 pipeline
track_counts_df <-
track %>%
# make it a dataframe
as.data.frame() %>%
rownames_to_column(var = "names") %>%
mutate(perc_reads_retained = 100 * nochim / input)
# Visualize it in table format
DT::datatable(track_counts_df)# Plot it!
track_counts_df %>%
pivot_longer(input:nochim, names_to = "read_type", values_to = "num_reads") %>%
mutate(read_type = fct_relevel(read_type,
"input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")) %>%
ggplot(aes(x = read_type, y = num_reads, fill = read_type)) +
geom_line(aes(group = names), color = "grey") +
geom_point(shape = 21, size = 3, alpha = 0.8) +
scale_fill_brewer(palette = "Spectral") +
labs(x = "Filtering Step", y = "Number of Sequences") +
theme_bw()Here, we will use the silva database version 138!
# The next line took 2 mins to run
taxa_train <-
assignTaxonomy(noChimeras_ASV_table,
"/workdir/in_class_data/taxonomy/silva_nr99_v138.1_train_set.fa.gz",
multithread=6)
# the next line took 3 minutes
taxa_addSpecies <-
addSpecies(taxa_train,
"/workdir/in_class_data/taxonomy/silva_species_assignment_v138.1.fa.gz")
# Inspect the taxonomy
taxa_print <- taxa_addSpecies # Removing sequence rownames for display only
rownames(taxa_print) <- NULL
#View(taxa_print)Below, we will prepare the following:
ASV_fastas: A fasta file that we can use to build a
tree for phylogenetic analyses (e.g. phylogenetic alpha diversity
metrics or UNIFRAC dissimilarty).########### 2. COUNT TABLE ###############
############## Modify the ASV names and then save a fasta file! ##############
# Give headers more manageable names
# First pull the ASV sequences
asv_seqs <- colnames(noChimeras_ASV_table)
asv_seqs[1:5]## [1] "TGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCGAGGAGGAAAGGGATGTTGCTAATATCAGCATCCTGTGACGTTACTCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGGCATCCAAAACTGAGAGGCTCGAGTGCGGAAGAGGAGTGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACACTCTGGTCTGACACTGACGCTGAGGTACGAAAGCGTGGGGAGCAAACA"
## [2] "CCAAGAATATTCCGCAATGGGGGAAACCCTGACGGAGCGACACTGCGTGAATGATGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTGGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTCCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCCAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA"
## [3] "CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTATGTAGGAAATGACATAATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTAGGGCTCAACTCTAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTGAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA"
## [4] "CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTAGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTTCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA"
## [5] "TGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCAGTGAGGAAGGTAATGTAGTTAATACCTGCATTATTTGACGTTAGCTGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATCGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACG"
# make headers for our ASV seq fasta file, which will be our asv names
asv_headers <- vector(dim(noChimeras_ASV_table)[2], mode = "character")
asv_headers[1:5]## [1] "" "" "" "" ""
# loop through vector and fill it in with ASV names
for (i in 1:dim(noChimeras_ASV_table)[2]) {
asv_headers[i] <- paste(">ASV", i, sep = "_")
}
# intitution check
asv_headers[1:5]## [1] ">ASV_1" ">ASV_2" ">ASV_3" ">ASV_4" ">ASV_5"
# Inspect the taxonomy table
#View(taxa_addSpecies)
##### Prepare tax table
# Add the ASV sequences from the rownames to a column
new_tax_tab <-
taxa_addSpecies%>%
as.data.frame() %>%
rownames_to_column(var = "ASVseqs")
head(new_tax_tab)## ASVseqs
## 1 TGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCGAGGAGGAAAGGGATGTTGCTAATATCAGCATCCTGTGACGTTACTCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGGCATCCAAAACTGAGAGGCTCGAGTGCGGAAGAGGAGTGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACACTCTGGTCTGACACTGACGCTGAGGTACGAAAGCGTGGGGAGCAAACA
## 2 CCAAGAATATTCCGCAATGGGGGAAACCCTGACGGAGCGACACTGCGTGAATGATGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTGGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTCCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCCAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA
## 3 CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTATGTAGGAAATGACATAATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTAGGGCTCAACTCTAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTGAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA
## 4 CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTAGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTTCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA
## 5 TGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCAGTGAGGAAGGTAATGTAGTTAATACCTGCATTATTTGACGTTAGCTGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATCGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACG
## 6 TAGGGAATTTTCGGCAATGGGGGAAACCCTGACCGAGCAACGCCGCGTGAATGAAGAAGTATTTCGGTATGTAAAATTCTTTTATTAGGGAAGAATGTCTGTGGTAGGAAATGCCCGCAGAGTGACGGTACCTAATGAATAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCTTGTTAAGTCTGAGGTTAAAGTGCGGGGCTCAACCCCGTGATGCCTTGGAAACTGACAGGCTTGAGTATGGTAGAGGCAAGTGGAATTTCATGTGTAGCGGTAAAATGCGTAAATATATGAAGGAACACCGGTGGCGAAGGCGGCTTGCTGGGCCATTACTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACA
## Kingdom Phylum Class Order
## 1 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales
## 2 Bacteria Spirochaetota Brevinematia Brevinematales
## 3 Bacteria Spirochaetota Brevinematia Brevinematales
## 4 Bacteria Spirochaetota Brevinematia Brevinematales
## 5 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales
## 6 Bacteria Firmicutes Bacilli Acholeplasmatales
## Family Genus Species
## 1 Endozoicomonadaceae Endozoicomonas <NA>
## 2 Brevinemataceae Brevinema <NA>
## 3 Brevinemataceae Brevinema <NA>
## 4 Brevinemataceae Brevinema <NA>
## 5 Vibrionaceae Enterovibrio <NA>
## 6 Acholeplasmataceae DMI <NA>
# intution check
stopifnot(new_tax_tab$ASVseqs == colnames(noChimeras_ASV_table))
# Now let's add the ASV names
rownames(new_tax_tab) <- rownames(asv_tab)
head(new_tax_tab)## ASVseqs
## ASV_1 TGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCGAGGAGGAAAGGGATGTTGCTAATATCAGCATCCTGTGACGTTACTCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGGCATCCAAAACTGAGAGGCTCGAGTGCGGAAGAGGAGTGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACACTCTGGTCTGACACTGACGCTGAGGTACGAAAGCGTGGGGAGCAAACA
## ASV_2 CCAAGAATATTCCGCAATGGGGGAAACCCTGACGGAGCGACACTGCGTGAATGATGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTGGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTCCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCCAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA
## ASV_3 CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTATGTAGGAAATGACATAATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTAGGGCTCAACTCTAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTGAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA
## ASV_4 CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTAGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTTCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA
## ASV_5 TGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCAGTGAGGAAGGTAATGTAGTTAATACCTGCATTATTTGACGTTAGCTGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATCGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACG
## ASV_6 TAGGGAATTTTCGGCAATGGGGGAAACCCTGACCGAGCAACGCCGCGTGAATGAAGAAGTATTTCGGTATGTAAAATTCTTTTATTAGGGAAGAATGTCTGTGGTAGGAAATGCCCGCAGAGTGACGGTACCTAATGAATAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCTTGTTAAGTCTGAGGTTAAAGTGCGGGGCTCAACCCCGTGATGCCTTGGAAACTGACAGGCTTGAGTATGGTAGAGGCAAGTGGAATTTCATGTGTAGCGGTAAAATGCGTAAATATATGAAGGAACACCGGTGGCGAAGGCGGCTTGCTGGGCCATTACTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACA
## Kingdom Phylum Class Order
## ASV_1 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales
## ASV_2 Bacteria Spirochaetota Brevinematia Brevinematales
## ASV_3 Bacteria Spirochaetota Brevinematia Brevinematales
## ASV_4 Bacteria Spirochaetota Brevinematia Brevinematales
## ASV_5 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales
## ASV_6 Bacteria Firmicutes Bacilli Acholeplasmatales
## Family Genus Species
## ASV_1 Endozoicomonadaceae Endozoicomonas <NA>
## ASV_2 Brevinemataceae Brevinema <NA>
## ASV_3 Brevinemataceae Brevinema <NA>
## ASV_4 Brevinemataceae Brevinema <NA>
## ASV_5 Vibrionaceae Enterovibrio <NA>
## ASV_6 Acholeplasmataceae DMI <NA>
### Final prep of tax table. Add new column with ASV names
asv_tax <-
new_tax_tab %>%
# add rownames from count table for phyloseq handoff
mutate(ASV = rownames(asv_tab)) %>%
# Resort the columns with select
dplyr::select(Kingdom, Phylum, Class, Order, Family, Genus, Species, ASV, ASVseqs)
head(asv_tax)## Kingdom Phylum Class Order
## ASV_1 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales
## ASV_2 Bacteria Spirochaetota Brevinematia Brevinematales
## ASV_3 Bacteria Spirochaetota Brevinematia Brevinematales
## ASV_4 Bacteria Spirochaetota Brevinematia Brevinematales
## ASV_5 Bacteria Proteobacteria Gammaproteobacteria Enterobacterales
## ASV_6 Bacteria Firmicutes Bacilli Acholeplasmatales
## Family Genus Species ASV
## ASV_1 Endozoicomonadaceae Endozoicomonas <NA> ASV_1
## ASV_2 Brevinemataceae Brevinema <NA> ASV_2
## ASV_3 Brevinemataceae Brevinema <NA> ASV_3
## ASV_4 Brevinemataceae Brevinema <NA> ASV_4
## ASV_5 Vibrionaceae Enterovibrio <NA> ASV_5
## ASV_6 Acholeplasmataceae DMI <NA> ASV_6
## ASVseqs
## ASV_1 TGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCGAGGAGGAAAGGGATGTTGCTAATATCAGCATCCTGTGACGTTACTCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGTAGGCGGCCTGTTAAGTTGGATGTGAAAGCCCCGGGCTCAACCTGGGAACGGCATCCAAAACTGAGAGGCTCGAGTGCGGAAGAGGAGTGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACACTCTGGTCTGACACTGACGCTGAGGTACGAAAGCGTGGGGAGCAAACA
## ASV_2 CCAAGAATATTCCGCAATGGGGGAAACCCTGACGGAGCGACACTGCGTGAATGATGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTGGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTCCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCCAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA
## ASV_3 CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTATGTAGGAAATGACATAATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTAGGGCTCAACTCTAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTGAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA
## ASV_4 CCAAGAATATTCCGCAATGGGGGGAACCCTGACGGAGCGACACTGCGTGAATGAAGAAGGCCTTCGGGTTGTAAAGTTCTTTTATAAAGGAAGAATAAGTTAGGTAGGAAATGACCTGATGATGACGGTACTTTATGAATAAGTCCCGGCTAATTACGTGCCAGCAGCCGCGGTAATACGTAAGGGACGAGCGTTATTCGGAATTACTGGGCGTAAAGGGCGTGTAGGCGGTTAATTAGGCTGAGTGTTAAAGACTGGGGCTCAACTTCAGAAGGGCATTCAGAACCGGTTGACTAGAATCTGGTGGAAGGCAACGGAATTTCCCGTGTAGCGGTGGAATGCATAGATATGGGAAGGAACACCAAAGGCGAAGGCAGTTGTCTATGCTAAGATTGACGCTGAGGCGCGAAAGTGTGGGGATCAAATA
## ASV_5 TGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTCAGCAGTGAGGAAGGTAATGTAGTTAATACCTGCATTATTTGACGTTAGCTGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCGAGCGTTAATCGGAATTACTGGGCGTAAAGCGCATGCAGGCGGTTTGTTAAGCAAGATGTGAAAGCCCCGGGCTTAACCTGGGAATCGCATTTTGAACTGGCAAGCTAGAGTCTTGTAGAGGGGGGTAGAATTTCAGGTGTAGCGGTGAAATGCGTAGAGATCTGAAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGATGCGAAAGCGTGGGGAGCAAACG
## ASV_6 TAGGGAATTTTCGGCAATGGGGGAAACCCTGACCGAGCAACGCCGCGTGAATGAAGAAGTATTTCGGTATGTAAAATTCTTTTATTAGGGAAGAATGTCTGTGGTAGGAAATGCCCGCAGAGTGACGGTACCTAATGAATAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCTTGTTAAGTCTGAGGTTAAAGTGCGGGGCTCAACCCCGTGATGCCTTGGAAACTGACAGGCTTGAGTATGGTAGAGGCAAGTGGAATTTCATGTGTAGCGGTAAAATGCGTAAATATATGAAGGAACACCGGTGGCGAAGGCGGCTTGCTGGGCCATTACTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACA
01_DADA2 filesNow, we will write the files! We will write the following to the
data/01_DADA2/ folder. We will save both as files that
could be submitted as supplements AND as .RData objects for easy loading
into the next steps into R.:
ASV_counts.tsv: ASV count table that has ASV names that
are re-written and shortened headers like ASV_1, ASV_2, etc, which will
match the names in our fasta file below. This will also be saved as
data/01_DADA2/ASV_counts.RData.ASV_counts_withSeqNames.tsv: This is generated with the
data object in this file known as noChimeras_ASV_table. ASV
headers include the entire ASV sequence ~250bps. In addition,
we will save this as a .RData object as
data/01_DADA2/noChimeras_ASV_table.RData as we will use
this data in analysis/02_Taxonomic_Assignment.Rmd to assign
the taxonomy from the sequence headers.ASVs.fasta: A fasta file output of the ASV names from
ASV_counts.tsv and the sequences from the ASVs in
ASV_counts_withSeqNames.tsv. A fasta file that we can use
to build a tree for phylogenetic analyses (e.g. phylogenetic alpha
diversity metrics or UNIFRAC dissimilarty).ASVs.fasta in
data/02_TaxAss_FreshTrain/ to be used for the taxonomy
classification in the next step in the workflow.track_read_counts.RData: To track how many reads we
lost throughout our workflow that could be used and plotted later. We
will add this to the metadata in
analysis/02_Taxonomic_Assignment.Rmd.# FIRST, we will save our output as regular files, which will be useful later on.
# Save to regular .tsv file
# Write BOTH the modified and unmodified ASV tables to a file!
# Write count table with ASV numbered names (e.g. ASV_1, ASV_2, etc)
write.table(asv_tab, "data/01_DADA2/07_fullLength/ASV_counts.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write count table with ASV sequence names
write.table(noChimeras_ASV_table, "data/01_DADA2/07_fullLength/ASV_counts_withSeqNames.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write out the fasta file for reference later on for what seq matches what ASV
asv_fasta <- c(rbind(asv_headers, asv_seqs))
# Save to a file!
write(asv_fasta, "data/01_DADA2/07_fullLength/ASVs.fasta")
# SECOND, let's save the taxonomy tables
# Write the table
write.table(asv_tax, "data/01_DADA2/07_fullLength/ASV_taxonomy.tsv", sep = "\t", quote = FALSE, col.names = NA)
# THIRD, let's save to a RData object
# Each of these files will be used in the analysis/02_Taxonomic_Assignment
# RData objects are for easy loading :)
save(noChimeras_ASV_table, file = "data/01_DADA2/07_fullLength/noChimeras_ASV_table.RData")
save(asv_tab, file = "data/01_DADA2/07_fullLength/ASV_counts.RData")
# And save the track_counts_df a R object, which we will merge with metadata information in the next step of the analysis in nalysis/02_Taxonomic_Assignment.
save(track_counts_df, file = "data/01_DADA2/07_fullLength/track_read_counts.RData")##Session information
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.3.2 (2023-10-31)
## os Rocky Linux 9.0 (Blue Onyx)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2024-04-16
## pandoc 3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [2] CRAN (R 4.3.2)
## ade4 1.7-22 2023-02-06 [1] CRAN (R 4.3.2)
## ape 5.8 2024-04-11 [1] CRAN (R 4.3.2)
## Biobase 2.62.0 2023-10-24 [2] Bioconductor
## BiocGenerics 0.48.1 2023-11-01 [2] Bioconductor
## BiocParallel 1.36.0 2023-10-24 [2] Bioconductor
## biomformat 1.30.0 2023-10-24 [1] Bioconductor
## Biostrings 2.70.1 2023-10-25 [2] Bioconductor
## bitops 1.0-7 2021-04-24 [2] CRAN (R 4.3.2)
## bslib 0.5.1 2023-08-11 [2] CRAN (R 4.3.2)
## cachem 1.0.8 2023-05-01 [2] CRAN (R 4.3.2)
## callr 3.7.3 2022-11-02 [2] CRAN (R 4.3.2)
## cli 3.6.1 2023-03-23 [2] CRAN (R 4.3.2)
## cluster 2.1.4 2022-08-22 [2] CRAN (R 4.3.2)
## codetools 0.2-19 2023-02-01 [2] CRAN (R 4.3.2)
## colorspace 2.1-0 2023-01-23 [2] CRAN (R 4.3.2)
## crayon 1.5.2 2022-09-29 [2] CRAN (R 4.3.2)
## crosstalk 1.2.0 2021-11-04 [2] CRAN (R 4.3.2)
## dada2 * 1.30.0 2023-10-24 [1] Bioconductor
## data.table 1.14.8 2023-02-17 [2] CRAN (R 4.3.2)
## DelayedArray 0.28.0 2023-10-24 [2] Bioconductor
## deldir 1.0-9 2023-05-17 [2] CRAN (R 4.3.2)
## devtools * 2.4.4 2022-07-20 [2] CRAN (R 4.2.1)
## digest 0.6.33 2023-07-07 [2] CRAN (R 4.3.2)
## dplyr * 1.1.3 2023-09-03 [2] CRAN (R 4.3.2)
## DT * 0.32 2024-02-19 [1] CRAN (R 4.3.2)
## ellipsis 0.3.2 2021-04-29 [2] CRAN (R 4.3.2)
## evaluate 0.23 2023-11-01 [2] CRAN (R 4.3.2)
## fansi 1.0.5 2023-10-08 [2] CRAN (R 4.3.2)
## farver 2.1.1 2022-07-06 [2] CRAN (R 4.3.2)
## fastmap 1.1.1 2023-02-24 [2] CRAN (R 4.3.2)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.2)
## foreach 1.5.2 2022-02-02 [2] CRAN (R 4.3.2)
## fs 1.6.3 2023-07-20 [2] CRAN (R 4.3.2)
## generics 0.1.3 2022-07-05 [2] CRAN (R 4.3.2)
## GenomeInfoDb 1.38.0 2023-10-24 [2] Bioconductor
## GenomeInfoDbData 1.2.11 2023-11-07 [2] Bioconductor
## GenomicAlignments 1.38.0 2023-10-24 [2] Bioconductor
## GenomicRanges 1.54.1 2023-10-29 [2] Bioconductor
## ggplot2 * 3.5.0 2024-02-23 [2] CRAN (R 4.3.2)
## glue 1.6.2 2022-02-24 [2] CRAN (R 4.3.2)
## gtable 0.3.4 2023-08-21 [2] CRAN (R 4.3.2)
## highr 0.10 2022-12-22 [2] CRAN (R 4.3.2)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.2)
## htmltools 0.5.7 2023-11-03 [2] CRAN (R 4.3.2)
## htmlwidgets 1.6.2 2023-03-17 [2] CRAN (R 4.3.2)
## httpuv 1.6.12 2023-10-23 [2] CRAN (R 4.3.2)
## hwriter 1.3.2.1 2022-04-08 [1] CRAN (R 4.3.2)
## igraph 1.5.1 2023-08-10 [2] CRAN (R 4.3.2)
## interp 1.1-6 2024-01-26 [1] CRAN (R 4.3.2)
## IRanges 2.36.0 2023-10-24 [2] Bioconductor
## iterators 1.0.14 2022-02-05 [2] CRAN (R 4.3.2)
## jpeg 0.1-10 2022-11-29 [1] CRAN (R 4.3.2)
## jquerylib 0.1.4 2021-04-26 [2] CRAN (R 4.3.2)
## jsonlite 1.8.7 2023-06-29 [2] CRAN (R 4.3.2)
## knitr 1.45 2023-10-30 [2] CRAN (R 4.3.2)
## labeling 0.4.3 2023-08-29 [2] CRAN (R 4.3.2)
## later 1.3.1 2023-05-02 [2] CRAN (R 4.3.2)
## lattice 0.21-9 2023-10-01 [2] CRAN (R 4.3.2)
## latticeExtra 0.6-30 2022-07-04 [1] CRAN (R 4.3.2)
## lifecycle 1.0.3 2022-10-07 [2] CRAN (R 4.3.2)
## lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.2)
## magrittr 2.0.3 2022-03-30 [2] CRAN (R 4.3.2)
## MASS 7.3-60 2023-05-04 [2] CRAN (R 4.3.2)
## Matrix 1.6-1.1 2023-09-18 [2] CRAN (R 4.3.2)
## MatrixGenerics 1.14.0 2023-10-24 [2] Bioconductor
## matrixStats 1.1.0 2023-11-07 [2] CRAN (R 4.3.2)
## memoise 2.0.1 2021-11-26 [2] CRAN (R 4.3.2)
## mgcv 1.9-0 2023-07-11 [2] CRAN (R 4.3.2)
## mime 0.12 2021-09-28 [2] CRAN (R 4.3.2)
## miniUI 0.1.1.1 2018-05-18 [2] CRAN (R 4.3.2)
## multtest 2.58.0 2023-10-24 [1] Bioconductor
## munsell 0.5.0 2018-06-12 [2] CRAN (R 4.3.2)
## nlme 3.1-163 2023-08-09 [2] CRAN (R 4.3.2)
## pacman 0.5.1 2019-03-11 [1] CRAN (R 4.3.2)
## patchwork * 1.2.0.9000 2024-03-12 [1] Github (thomasp85/patchwork@d943757)
## permute 0.9-7 2022-01-27 [1] CRAN (R 4.3.2)
## phyloseq * 1.41.1 2024-03-09 [1] Github (joey711/phyloseq@c260561)
## pillar 1.9.0 2023-03-22 [2] CRAN (R 4.3.2)
## pkgbuild 1.4.2 2023-06-26 [2] CRAN (R 4.3.2)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.3.2)
## pkgload 1.3.3 2023-09-22 [2] CRAN (R 4.3.2)
## plyr 1.8.9 2023-10-02 [2] CRAN (R 4.3.2)
## png 0.1-8 2022-11-29 [2] CRAN (R 4.3.2)
## prettyunits 1.2.0 2023-09-24 [2] CRAN (R 4.3.2)
## processx 3.8.2 2023-06-30 [2] CRAN (R 4.3.2)
## profvis 0.3.8 2023-05-02 [2] CRAN (R 4.3.2)
## promises 1.2.1 2023-08-10 [2] CRAN (R 4.3.2)
## ps 1.7.5 2023-04-18 [2] CRAN (R 4.3.2)
## purrr * 1.0.2 2023-08-10 [2] CRAN (R 4.3.2)
## R6 2.5.1 2021-08-19 [2] CRAN (R 4.3.2)
## RColorBrewer 1.1-3 2022-04-03 [2] CRAN (R 4.3.2)
## Rcpp * 1.0.11 2023-07-06 [2] CRAN (R 4.3.2)
## RcppParallel 5.1.7 2023-02-27 [2] CRAN (R 4.3.2)
## RCurl 1.98-1.13 2023-11-02 [2] CRAN (R 4.3.2)
## readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.2)
## remotes 2.4.2.1 2023-07-18 [2] CRAN (R 4.3.2)
## reshape2 1.4.4 2020-04-09 [2] CRAN (R 4.3.2)
## rhdf5 2.46.1 2023-11-29 [1] Bioconductor 3.18 (R 4.3.2)
## rhdf5filters 1.14.1 2023-11-06 [1] Bioconductor
## Rhdf5lib 1.24.2 2024-02-07 [1] Bioconductor 3.18 (R 4.3.2)
## rlang 1.1.2 2023-11-04 [2] CRAN (R 4.3.2)
## rmarkdown 2.25 2023-09-18 [2] CRAN (R 4.3.2)
## Rsamtools 2.18.0 2023-10-24 [2] Bioconductor
## rstudioapi 0.15.0 2023-07-07 [2] CRAN (R 4.3.2)
## S4Arrays 1.2.0 2023-10-24 [2] Bioconductor
## S4Vectors 0.40.1 2023-10-26 [2] Bioconductor
## sass 0.4.7 2023-07-15 [2] CRAN (R 4.3.2)
## scales 1.3.0 2023-11-28 [2] CRAN (R 4.3.2)
## sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.3.2)
## shiny 1.7.5.1 2023-10-14 [2] CRAN (R 4.3.2)
## ShortRead 1.60.0 2023-10-24 [1] Bioconductor
## SparseArray 1.2.1 2023-11-05 [2] Bioconductor
## stringi 1.7.12 2023-01-11 [2] CRAN (R 4.3.2)
## stringr * 1.5.0 2022-12-02 [2] CRAN (R 4.3.2)
## SummarizedExperiment 1.32.0 2023-10-24 [2] Bioconductor
## survival 3.5-7 2023-08-14 [2] CRAN (R 4.3.2)
## tibble * 3.2.1 2023-03-20 [2] CRAN (R 4.3.2)
## tidyr * 1.3.0 2023-01-24 [2] CRAN (R 4.3.2)
## tidyselect 1.2.0 2022-10-10 [2] CRAN (R 4.3.2)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.2)
## timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.2)
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.2)
## urlchecker 1.0.1 2021-11-30 [2] CRAN (R 4.3.2)
## usethis * 2.2.2 2023-07-06 [2] CRAN (R 4.3.2)
## utf8 1.2.4 2023-10-22 [2] CRAN (R 4.3.2)
## vctrs 0.6.4 2023-10-12 [2] CRAN (R 4.3.2)
## vegan 2.6-4 2022-10-11 [1] CRAN (R 4.3.2)
## withr 2.5.2 2023-10-30 [2] CRAN (R 4.3.2)
## xfun 0.41 2023-11-01 [2] CRAN (R 4.3.2)
## xtable 1.8-4 2019-04-21 [2] CRAN (R 4.3.2)
## XVector 0.42.0 2023-10-24 [2] Bioconductor
## yaml 2.3.7 2023-01-23 [2] CRAN (R 4.3.2)
## zlibbioc 1.48.0 2023-10-24 [2] Bioconductor
##
## [1] /home/cab565/R/x86_64-pc-linux-gnu-library/4.3
## [2] /programs/R-4.3.2/library
##
## ──────────────────────────────────────────────────────────────────────────────